import time
import urllib3
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def _collect_listing_urls(url, headers, timeout):
    """Fetch one index page and return the listing URLs reachable from it.

    The index page at *url* is parsed for the links under the element with
    id ``kesfqbfylb_A01_01_03``; each of those pages is then fetched and the
    anchors inside ``div.shop_list.shop_list_4`` are turned into absolute
    URLs carrying their ``data_channel`` value as the ``channel`` query
    parameter.

    Any network or parsing error propagates to the caller.
    """
    response = requests.get(url, headers=headers, verify=False, timeout=timeout)
    page = etree.HTML(response.text)
    print('------------------------------------')
    # Links found on the index page; these lead to the target pages.
    detail_paths = page.xpath('//*[@id="kesfqbfylb_A01_01_03"]/dt/a/@href')

    collected = []
    for position, path in enumerate(detail_paths, start=1):
        detail_url = f"https://lz.esf.fang.com{path}?channel=1,2&psid=1_{position}_60"
        print(detail_url)

        response = requests.get(detail_url, headers=headers, verify=False, timeout=timeout)
        page = etree.HTML(response.text)
        # Read both attributes from the same anchor so that an anchor missing
        # one of them cannot shift the href/channel pairing of the others.
        for anchor in page.xpath('//div[@class="shop_list shop_list_4"]/dl/dt/a'):
            href = anchor.get('href')
            channel = anchor.get('data_channel')
            if href is None or channel is None:
                continue
            collected.append('https://lz.esf.fang.com' + href + '?channel=' + channel)
    return collected


def get_detail_url(url, max_retries=3, timeout=30):
    """Scrape listing URLs behind *url* and append them to the global ``house``.

    If fetching or parsing fails, ``process_captcha()`` is invoked and the
    whole page is attempted again, up to *max_retries* additional times.
    Results are added to ``house`` only after a page has been scraped
    completely, so a retry never records the same URLs twice.

    Args:
        url: Address of the index page to scrape.
        max_retries: How many times to retry after a failure before giving
            up on this page. Negative values are treated as 0.
        timeout: Timeout in seconds passed to every ``requests.get`` call.

    Returns:
        None. Found URLs are appended to the module-level list ``house``.
        When all attempts fail, a message is printed and nothing is added.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
    attempts = max(0, max_retries) + 1

    for attempt in range(1, attempts + 1):
        try:
            found = _collect_listing_urls(url, headers, timeout)
        except Exception as exc:
            # Broad on purpose: the original treated any failure as a sign
            # that the captcha must be solved. Unlike a bare ``except`` this
            # lets KeyboardInterrupt / SystemExit through.
            print(f'Failed to scrape {url} (attempt {attempt}/{attempts}): {exc!r}')
            if attempt == attempts:
                print(f'Giving up on {url}')
                return
            process_captcha()
        else:
            house.extend(found)
            return

def process_captcha(wait_seconds=10):
    """Open a captcha page in Firefox so a human can solve it, then submit.

    A Firefox window is opened on a fixed listing URL, the function sleeps
    for *wait_seconds* so the captcha can be typed in by hand, and then the
    element named ``submit`` is clicked.

    Args:
        wait_seconds: Seconds to wait for the manual captcha entry.

    Raises:
        Whatever selenium raises if the page cannot be loaded or the
        ``submit`` element is not present; the browser is shut down either
        way.
    """
    # This URL only serves to bring up the captcha page; no specific page
    # is required.
    url = 'https://lz.esf.fang.com/chushou/3_416752691.htm?channel=2,2'
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        # Leave time for the captcha to be entered manually.
        time.sleep(wait_seconds)
        driver.find_element(By.NAME, 'submit').click()
    finally:
        # quit() ends the whole WebDriver session, so neither the browser
        # nor the driver process is left behind, even after an error.
        driver.quit()







if __name__ == '__main__':

    # Module-level accumulator: get_detail_url() appends the URLs it finds
    # to this list.
    house = []
    for page_number in range(1, 5):
        print('-' * 36)
        print(f'正在爬取第{page_number}页...')
        url = f'https://lz.esf.fang.com/house/i3{page_number}/'
        get_detail_url(url)
    print('爬取结束!!!!!!')

    # Append to any existing file; the context manager closes the handle
    # even if a write fails.
    with open('urls.txt', 'a+', encoding='utf8') as f:
        f.writelines(link + '\n' for link in house)